# readxl handles both .xls and .xlsx workbooks.
library(readxl)
# Import the take-home dataset and render it as an interactive table.
uniteddata <- read_excel("Take_home_Stats_test_Data.xlsx")
DT::datatable(uniteddata)
For a unit change in X1, keeping X2 constant, the mean of Y increases by 2.149586; for a unit change in X2, keeping X1 constant, the mean of Y increases by 3.465170.
# Fit a linear model of Y on X1 and X2. NOTE(review): glm() with the
# default gaussian family is equivalent to lm() here — confirm glm() was
# intended (lm() would also expose R^2 directly).
model1=glm(Y~X1+X2,data=uniteddata)
# Tidy coefficient table: estimates, standard errors, t statistics, p-values.
broom::tidy(model1)
## term estimate std.error statistic p.value
## 1 (Intercept) 16.539317 5.3464912 3.093490 2.118082e-03
## 2 X1 2.149586 0.1278502 16.813317 2.740094e-48
## 3 X2 3.465170 0.3715703 9.325743 7.829028e-19
# Per-observation diagnostics (.fitted, .resid, .hat, .cooksd, .std.resid).
head(broom::augment(model1))
## Y X1 X2 .fitted .se.fit .resid .hat
## 1 220.3576 52.82055 25.69345 219.1138 2.051231 1.2438138 0.006062251
## 2 201.8625 52.50277 21.01730 202.2271 1.609600 -0.3645488 0.003732850
## 3 179.3549 37.34995 21.16060 170.1513 1.756099 9.2035769 0.004443272
## 4 148.6372 32.85682 19.37569 154.3079 1.791368 -5.6707126 0.004623540
## 5 203.9518 49.15237 26.37978 213.6070 2.374467 -9.6551807 0.008123389
## 6 173.1624 30.51970 19.17214 148.5788 1.968793 24.5836232 0.005584767
## .sigma .cooksd .std.resid
## 1 26.37813 4.559422e-06 0.04735635
## 2 26.37820 2.400401e-07 -0.01386341
## 3 26.37413 1.823760e-04 0.35012737
## 4 26.37666 7.207070e-05 -0.21574780
## 5 26.37371 3.696804e-04 -0.36798830
## 6 26.34910 1.639249e-03 0.93575990
# Model-level fit summaries: deviance, log-likelihood, AIC/BIC, df.
broom::glance(model1)
## null.deviance df.null logLik AIC BIC deviance df.residual
## 1 1107792 399 -1874.581 3757.161 3773.127 275540.6 397
Y=uniteddata$Y
X2=uniteddata$X2
#formula <- y ~ poly(x, 3, raw = TRUE)
# stat_poly_eq() requires the label formula in terms of the plot aesthetics
# (x, y), not data column names. The original `Y ~ X2` neither used the
# aesthetic names nor matched the plotted x (X1), so the equation label was
# computed from the wrong relationship.
formula <- y ~ x
#formula <- y ~ poly(x, raw = TRUE)
fill <- "#4271AE"
line <- "#1F3552"
#p <-
# Scatter of Y vs X1 with an OLS line; the fitted equation and adjusted R^2
# are annotated on the panel via ggpmisc::stat_poly_eq().
ggplot(uniteddata, aes(x=X1, y=Y)) + geom_point(shape=1) + geom_smooth(method=lm, se=FALSE) +
ggtitle(" regression line") +
scale_x_continuous(name = "X1") +
scale_y_continuous(name = "Y") +
theme_economist() +
theme(axis.line.x = element_line(size=.5, colour = "black"),
axis.line.y = element_line(size=.5, colour = "black"),
axis.text.x=element_text(colour="black", size = 9),
axis.text.y=element_text(colour="black", size = 9),
panel.grid.major = element_line(colour = "#d3d3d3"),
panel.grid.minor = element_blank(),
panel.border = element_blank(), panel.background = element_blank(),
plot.title = element_text(family = "Tahoma"),
text=element_text(family="Tahoma"))+
stat_poly_eq(aes(label = paste(..eq.label.., ..adj.rr.label.., sep = "~~~~")),
formula = formula, rr.digits = 3, coef.digits = 2, parse = TRUE)
#ggplotly(p)
#ggplotly(p)
Y=uniteddata$Y
X2=uniteddata$X2
formula <- Y ~ X2
fill <- "#4271AE"
line <- "#1F3552"
# Scatter of Y vs X2 with an OLS line and the fitted equation / R^2 label.
# Bug fix: the x-axis label previously read "X1" although X2 is plotted.
ggplot(uniteddata, aes(x=X2, y=Y)) + geom_point(shape=1) + geom_smooth(method=lm, se=FALSE) +
ggtitle(" regression line") +
scale_x_continuous(name = "X2") +
scale_y_continuous(name = "Y") +
theme(axis.line.x = element_line(size=.5, colour = "black"),
axis.line.y = element_line(size=.5, colour = "black"),
axis.text.x=element_text(colour="black", size = 9),
axis.text.y=element_text(colour="black", size = 9),
panel.grid.major = element_line(colour = "#d3d3d3"),
panel.grid.minor = element_blank(),
panel.border = element_blank(), panel.background = element_blank(),
plot.title = element_text(family = "Tahoma"),
text=element_text(family="Tahoma"))+
stat_poly_eq(formula = y ~ x, eq.with.lhs=FALSE,
aes(label = paste("hat(italic(y))","~`=`~",..eq.label..,"~~~", ..rr.label.., sep = "")),parse = TRUE)
# Interactive version of the Y ~ X1 scatter: build the static ggplot first,
# then hand it to plotly.
fill <- "#4271AE"
line <- "#1F3552"
scatter_x1 <- ggplot(uniteddata, aes(x=X1, y=Y)) +
  geom_point(shape=1) +
  geom_smooth(method=lm, se=FALSE) +
  ggtitle(" regression line") +
  scale_x_continuous(name = "X1") +
  scale_y_continuous(name = "Y") +
  theme_economist() +
  theme(axis.line.x = element_line(size=.5, colour = "black"),
        axis.line.y = element_line(size=.5, colour = "black"),
        axis.text.x=element_text(colour="black", size = 9),
        axis.text.y=element_text(colour="black", size = 9),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(family = "Tahoma"),
        text=element_text(family="Tahoma"))
ggplotly(scatter_x1)
Checking Regression Assumptions 1) The mean of the residual errors is close to 0.
# Assumption check: the mean of the residuals should be ~0 (it is, ~2e-14).
mean(model1$residuals)
## [1] 2.161661e-14
#summary(model1)
#model1$residuals
#model1$fitted
#dataplot=data_frame(residuals=model1$residuals,fitted=model1$fitted,stdred=model1$std.resid)
# NOTE(review): par(mfrow=...) only affects base graphics; it has no effect
# on the ggplot2/autoplot output below.
par(mfrow = c(1, 2))
library(ggfortify)
# All six standard regression diagnostic plots in one ggplot grid.
ggplot2::autoplot(model1, which = 1:6, ncol = 3, label.size = F)+ theme_bw()
library(ggfortify)
# fortify() converts an object into a plain data frame usable by ggplot2;
# demonstrated here on the built-in USArrests data set.
myfortdata = fortify(USArrests)
myfortdata%>%head()%>%DT::datatable()
#head(myfortdata)%>%head()%>%DT::datatable()
# data_frame() is deprecated in the tibble package; base data.frame() gives
# an equivalent result for this use.
dataplot=data.frame(residuals=model1$residuals,fitted=model1$fitted)
# Residuals vs fitted values with a loess trend: checks linearity and
# homoscedasticity.
p1<-ggplot(dataplot, aes(fitted, residuals))+geom_point()+
geom_smooth(method="loess")+geom_hline(yintercept=0, col="red", linetype="dashed")+
xlab("Fitted values")+ylab("Residuals")+
ggtitle("Residual vs Fitted Plot")+theme_bw()+geom_point(shape=1,color="purple")
ggplotly(p1)
#===========================================================================
#
# broom augment and cbind.data.frame data add regression diagnostics to data
#===========================================================================
# NOTE(review): par(mfrow=...) does not affect the ggplot2 output below.
par(mfrow=c(2,2))
# Bind the augment() diagnostics onto the original data. Note that Y, X1
# and X2 appear twice in the result (once from each source).
myudata=cbind.data.frame(uniteddata,broom::augment(model1))
# Residuals vs fitted values, computed from the augmented data frame.
p=ggplot(data = myudata, aes(x = .fitted, y = .resid)) +
geom_hline(yintercept = 0, colour = "firebrick3") +geom_smooth(method="loess")+
geom_point()+xlab("Fitted values")+ylab("Residuals")+ggtitle("Residual vs Fitted Plot")+theme_bw()
ggplotly(p)
#===========================================================================
#
# fortify data add regression diagnostics to data
#===========================================================================
# fortify() attaches the regression diagnostics (.fitted, .resid, .hat,
# .cooksd, .stdresid) directly to the model's data.
myundata= fortify(model1)
# Bug fix: this plot previously used `myudata` (the cbind/augment frame
# from the prior section) instead of the `myundata` created just above.
pp=ggplot(data = myundata, aes(x = .fitted, y = .resid)) +
geom_hline(yintercept = 0, colour = "firebrick3") +
geom_point() +
geom_smooth(se = FALSE)+theme_bw()+
xlab("Fitted Values")+ylab("Residuals")+
ggtitle("Homoscedasticity assumption")
ggplotly(pp)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Inspect the fortified data (diagnostic columns shown in the output below).
myundata%>%head()
## Y X1 X2 .hat .sigma .cooksd .fitted
## 1 220.3576 52.82055 25.69345 0.006062251 26.37813 4.559422e-06 219.1138
## 2 201.8625 52.50277 21.01730 0.003732850 26.37820 2.400401e-07 202.2271
## 3 179.3549 37.34995 21.16060 0.004443272 26.37413 1.823760e-04 170.1513
## 4 148.6372 32.85682 19.37569 0.004623540 26.37666 7.207070e-05 154.3079
## 5 203.9518 49.15237 26.37978 0.008123389 26.37371 3.696804e-04 213.6070
## 6 173.1624 30.51970 19.17214 0.005584767 26.34910 1.639249e-03 148.5788
## .resid .stdresid
## 1 1.2438138 0.04735635
## 2 -0.3645488 -0.01386341
## 3 9.2035769 0.35012737
## 4 -5.6707126 -0.21574780
## 5 -9.6551807 -0.36798830
## 6 24.5836232 0.93575990
# Normal Q-Q plot of the standardized residuals; points should follow the
# reference line if the normality assumption holds.
# Typo fix in the displayed title: "Theoritcal" -> "Theoretical".
p2=ggplot(data = myundata, aes(sample = .stdresid)) +
stat_qq() +geom_abline(colour = "firebrick3")+
ggtitle("Theoretical Quantiles/Normality assumption")+theme_bw()
ggplotly(p2)
# Standardized residuals vs fitted values (scale-location style check).
# Fixes: title typo "Standadized" -> "Standardized"; the y label contained
# a stray "|" and redundant parentheses.
p3=ggplot(data = myundata, aes(x = .fitted, y = .stdresid)) +
geom_hline(yintercept = 0, colour = "firebrick3") +
geom_point()+ geom_smooth(se=T,method = 'loess')+theme_bw()+
ggtitle("Standardized Residuals vs Fitted")+
xlab("Fitted Value")+
ylab("Standardized residuals")
ggplotly(p3)
# Standardized residuals vs leverage (.hat): flags influential points.
# Fixes: removed the duplicate loess geom_smooth() (an se=FALSE layer was
# immediately overplotted by the se=TRUE one); the x label previously read
# "Fitted Value" although the x aesthetic is leverage; stray "|" removed.
p4=ggplot(data = myundata, aes(x = .hat, y = .stdresid)) +
geom_point() +
geom_smooth(se=T,method = 'loess')+theme_bw()+
ggtitle("Residuals vs. leverages")+
xlab("Leverage")+
ylab("Standardized residuals")
ggplotly(p4)
# Residuals vs fitted, with point area proportional to Cook's distance so
# influential observations stand out.
cooks_plot <- ggplot(data = myundata, aes(x = .fitted, y = .resid, size = .cooksd)) +
  geom_hline(yintercept = 0, colour = "firebrick3") +
  geom_point() +
  scale_size_area("Cook’s distance") +
  ggtitle("Points size reflecting Cook's distance") +
  theme_bw() +
  xlab("Fitted Values") +
  ylab("Residuals")
ggplotly(cooks_plot)
# Standardized residuals vs leverage, labelling each point with its row
# number; dotted lines mark the conventional |std. residual| = 2 bounds.
# Fixes: duplicate theme_bw() removed; x label previously said "Fitted
# Values" although x is leverage; "Standidized" typo corrected.
p6=ggplot(data = myundata, aes(x = .hat, y = .stdresid)) +
geom_point() +
geom_smooth(se = FALSE,method = 'loess') +
geom_text(label = rownames(myundata), size = 4) +
geom_hline(yintercept = 2, lty = "dotted", colour = "slateblue1") +
geom_hline(yintercept = -2, lty = "dotted", colour = "slateblue1")+
theme_bw()+xlab("Leverage")+ylab("Standardized Residuals")
ggplotly(p6)
# Compare quadratic (red) and cubic (blue) least-squares fits of Y on X1.
quad_vs_cubic <- ggplot(data = myundata, aes(x =X1, y = Y)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), colour = "red", se = FALSE) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3), colour = "blue", se = FALSE) +
  theme_bw()
ggplotly(quad_vs_cubic)
# We define our own summary function (mean of log(y)) and use a line geom.
# Helper used as a stat_summary aggregator.
meanlog <- function(y) {mean(log(y))}
# NOTE(review): fun.y was deprecated in ggplot2 3.3 in favour of `fun`;
# kept as-is to match the ggplot2 version this script was written against.
ggplot(myundata, aes(x=X1, y=Y)) +
stat_summary(fun.y="meanlog", geom="line")
# Kernel density estimate of the response Y.
tp <- ggplot(myundata, aes(Y))+geom_density()
tp
3) No autocorrelation of residuals is present, as seen from the plot: the spikes after the first lag are close to 0.
# Method 1: visualise autocorrelation of the residuals with an ACF plot.
acf(model1$residuals)
# Method 2: Durbin-Watson test (DW near 2 => no first-order autocorrelation).
# (The original comment said "Method 3"; only two methods appear here.)
lmtest::dwtest(model1)
##
## Durbin-Watson test
##
## data: model1
## DW = 1.9857, p-value = 0.4432
## alternative hypothesis: true autocorrelation is greater than 0
4) The X variables and residuals are uncorrelated.
# Correlation between X1 and the residuals — essentially zero, as expected
# by construction of least squares.
cor.test(uniteddata$X1, model1$residuals)
##
## Pearson's product-moment correlation
##
## data: uniteddata$X1 and model1$residuals
## t = -2.5554e-14, df = 398, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09805172 0.09805172
## sample estimates:
## cor
## -1.280924e-15
# Same check for X2.
cor.test(uniteddata$X2, model1$residuals)
##
## Pearson's product-moment correlation
##
## data: uniteddata$X2 and model1$residuals
## t = -1.2686e-14, df = 398, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09805172 0.09805172
## sample estimates:
## cor
## -6.358721e-16
5)The variability in X values is positive.The X variables in the sample must all not be the same or nearly the same.
# Sample variances of the predictors: both positive, so the predictors are
# not (nearly) constant.
var(uniteddata$X1)
## [1] 215.3105
var(uniteddata$X2)
## [1] 25.49096
# Variance inflation factors. NOTE(review): vif() is not base R — it comes
# from an attached package (presumably car); confirm which one is loaded.
vif(model1)
## X1 X2
## 2.023233 2.023233
The correlation between X1 and X2 is very high (0.7111551). This violates the linear regression assumption of no multicollinearity.
library(corrplot)
##
## Attaching package: 'corrplot'
## The following object is masked from 'package:pls':
##
## corrplot
# Correlation plot and matrix of the predictors. Columns 1 and 2 are
# dropped, leaving X1 and X2 (see the matrix output below).
corrplot(cor(uniteddata[, -c(1,2)]))
cor(uniteddata[, -c(1,2)])
## X1 X2
## X1 1.0000000 0.7111551
## X2 0.7111551 1.0000000
Check assumptions automatically
#par(mfrow=c(2,2)) # draw 4 plots in same window
#library(gvlma)
# Refit with lm() so gvlma can run its global assumption tests.
model <- lm(Y ~ X1 + X2, data = uniteddata)
gvlma::gvlma(model)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = uniteddata)
##
## Coefficients:
## (Intercept) X1 X2
## 16.539 2.150 3.465
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma::gvlma(x = model)
##
## Value p-value Decision
## Global Stat 8.759e+04 0.00000 Assumptions NOT satisfied!
## Skewness 3.126e+03 0.00000 Assumptions NOT satisfied!
## Kurtosis 8.446e+04 0.00000 Assumptions NOT satisfied!
## Link Function 9.031e-01 0.34194 Assumptions acceptable.
## Heteroscedasticity 3.494e+00 0.06161 Assumptions acceptable.
#DT:datatable(influence.measures(mod))
#DT::datatable(data.table(influence.measures(mod)))
#data.table(influence.measures(mod))
# Interactive table of the influence-measure matrix (dfbetas, dffits,
# covratio, Cook's distance, hat values) for each observation.
DT::datatable((influence.measures(model)$infmat))
Principal components regression/analysis or various forms of factor analysis can be used when the explanatory variables are correlated.
# Partial least squares regression with a single component, as a remedy for
# the collinearity between X1 and X2.
model2 <- pls::plsr(Y ~ X1 + X2, data = uniteddata, ncomp = 1)
summary(model2)
## Data: X dimension: 400 2
## Y dimension: 400 1
## Fit method: kernelpls
## Number of components considered: 1
## TRAINING: % variance explained
## 1 comps
## X 95.07
## Y 72.09
# NOTE(review): the bare `3` below just echoes 3 to the console — it looks
# like a leftover from a removed computation; confirm and delete if so.
3
## [1] 3
#biplot(model2)
#plot(model2, plottype = "biplot")
library(plsdepot)
# PLS regression via plsdepot (columns 3:4 are X1 and X2), with
# cross-validation; plot() shows the circle of correlations.
model3=plsdepot::plsreg1(uniteddata[,3:4],uniteddata["Y"],comps=1, crosval=TRUE)
plot(model3)
The chi-square statistic and p-value in factanal test the hypothesis that the model fits the data perfectly. About 77.3% of the variance explained by one factor is very high.
# One-factor maximum-likelihood factor analysis of Y, X1, X2 (first column
# dropped).
factanal(uniteddata[,-1],1)
##
## Call:
## factanal(x = uniteddata[, -1], factors = 1)
##
## Uniquenesses:
## Y X1 X2
## 0.111 0.217 0.354
##
## Loadings:
## Factor1
## Y 0.943
## X1 0.885
## X2 0.803
##
## Factor1
## SS loadings 2.318
## Proportion Var 0.773
##
## The degrees of freedom for the model is 0 and the fit was 0
The data here was restructured into three columns: the number of trials (1, ..., 13), driving school (Yes/No), and the frequency of the number of trials.
# Driving-test data: number of trials to pass (1..13) with the frequency of
# drivers who did not (No) / did (Yes) attend a driving school.
# Fix: seq(1:13) replaced by the direct 1:13 — seq() applied to a vector is
# a misuse of seq() that only produced the right result by accident.
uniteddata2=data_frame(No.Trial=1:13,No=c(29,16,17,4,3,9,4,5,1,1,1,3,7)
,Yes=c(198,107,55,38,18,22,7,9,5,3,6,6,12))
# Long format: one row per (trials, school) pair with its frequency.
m1=reshape2::melt(uniteddata2,id.vars ="No.Trial",variable.name ="Driving.school",value.name ="Frequency")
# m2 recasts back to wide format. NOTE(review): m2 is never used afterwards.
m2= reshape2::dcast(reshape2::melt(uniteddata2,id.vars ="No.Trial"),No.Trial~variable )
DT::datatable(m1)
Chisquare Test.
# NOTE(review): this cross-tabulation sums No.Trial within cells defined by
# school and raw frequency value, which is an unusual table for this data —
# the natural form would be xtabs(Frequency ~ Driving.school + No.Trial).
# Kept as-is so the chi-square output below still matches.
xtabs(No.Trial~Driving.school+Frequency, data=m1)
## Frequency
## Driving.school 1 3 4 5 6 7 9 12 16 17 18 22 29 38 55 107 198
## No 30 17 11 8 0 13 6 0 2 3 0 0 1 0 0 0 0
## Yes 0 10 0 9 23 7 8 13 0 0 5 6 0 4 3 2 1
# Chi-square test of independence on the same table; the p-value is
# simulated to avoid small-expected-count problems.
chisq.test(xtabs(No.Trial~Driving.school+Frequency, data=m1),simulate.p.value = T)%>%tidy()
## statistic p.value parameter
## 1 107.9594 0.0004997501 NA
## method
## 1 Pearson's Chi-squared test with simulated p-value\n\t (based on 2000 replicates)
The frequency of those passing the driving test who attended a driving school is higher than those who didn’t attend a driving school. Attending a driving school increases your probability of passing a driving test.
# Frequency by number of trials, one line per driving-school group.
p=ggplot(m1, aes(No.Trial,Frequency, color=Driving.school)) +
geom_line() +
geom_point()+theme_bw()+xlab("Number of Trials")
ggplotly(p)
# Histogram of the frequencies. Fix: the x axis plots m1$Frequency, so the
# label was corrected (it previously read "Number of Trials").
pq1=qplot( m1$Frequency, geom="histogram",binwidth = 2)+theme_minimal()+xlab("Frequency")
ggplotly(pq1)
Taking the log of the frequency of the number of trials for both those who had training from a driving school and those who didn’t. The regular assumption on the explanatory variables is that of normality. Even with the logarithmic transformation, the distribution does not appear to be normal.
# Histogram of log(Frequency). Fix: the x-axis label was corrected to match
# the plotted quantity (it previously read "log(Number of Trials)").
pq=qplot(log( m1$Frequency), geom="histogram",binwidth = 0.5)+theme_minimal()+xlab("log(Frequency)")
ggplotly(pq)
For drivers who attended a driving school, the mean log-frequency of passing the driving test in a given trial is higher by 1.90802228 compared to those who did not attend a driving school.
# Model log-frequency as a function of trials, school and their interaction.
# NOTE(review): `No.Trial*Driving.school` already expands to both main
# effects plus the interaction, so the explicit main-effect terms are
# redundant (harmless: R drops the duplicates).
model4=glm(log(Frequency)~(No.Trial)+Driving.school+No.Trial*Driving.school,data=m1)
#summary(model4)
# Coefficient table for the log-frequency model.
broom::tidy(model4)
## term estimate std.error statistic p.value
## 1 (Intercept) 2.86791968 0.45198583 6.345154 2.194396e-06
## 2 No.Trial -0.19326892 0.05694486 -3.393966 2.608602e-03
## 3 Driving.schoolYes 1.90802228 0.63920449 2.984995 6.827809e-03
## 4 No.Trial:Driving.schoolYes -0.08783224 0.08053220 -1.090647 2.872298e-01
# Per-observation diagnostics for model4.
head(broom::augment(model4))
## log.Frequency. No.Trial Driving.school .fitted .se.fit .resid
## 1 3.367296 1 No 2.674651 0.4026610 0.6926451
## 2 2.772589 2 No 2.481382 0.3556205 0.2912069
## 3 2.833213 3 No 2.288113 0.3118999 0.5451004
## 4 1.386294 4 No 2.094844 0.2730980 -0.7085496
## 5 1.098612 5 No 1.901575 0.2415966 -0.8029628
## 6 2.197225 6 No 1.708306 0.2205465 0.4889184
## .hat .sigma .cooksd .std.resid
## 1 0.27472527 0.7660149 0.106138895 1.0586910
## 2 0.21428571 0.7830317 0.012468860 0.4276408
## 3 0.16483516 0.7754587 0.029745280 0.7764262
## 4 0.12637363 0.7687087 0.035213164 -0.9867728
## 5 0.09890110 0.7643337 0.033266457 -1.1010802
## 6 0.08241758 0.7783783 0.009912018 0.6643908
# Model-level fit statistics for model4.
broom::glance(model4)
## null.deviance df.null logLik AIC BIC deviance df.residual
## 1 45.03364 25 -27.86532 65.73064 72.02112 12.98384 22
# 70/30 train/test split stratified on Driving.school (caret).
# NOTE(review): there is no set.seed() before this random partition, so the
# split — and every downstream number — is not reproducible.
index <- createDataPartition(m1$Driving.school,p=0.70, list=FALSE)
trainSet <- m1[index,]
testSet <- m1[-index,]
# Logistic regression: probability of having attended a driving school as a
# function of trial number and log frequency.
model5=glm(Driving.school~No.Trial+log(Frequency),data=trainSet,family="binomial")
# Profile-likelihood confidence intervals for the coefficients.
knitr::kable(confint(model5))
## Waiting for profiling to be done...
| 2.5 % | 97.5 % | |
|---|---|---|
| (Intercept) | -19.3296658 | -2.653230 |
| No.Trial | 0.1018008 | 1.352168 |
| log(Frequency) | 0.7677598 | 4.940666 |
# Tidy coefficient table for model5.
tidy(model5)
## term estimate std.error statistic p.value
## 1 (Intercept) -8.9546485 4.0593821 -2.205914 0.02739001
## 2 No.Trial 0.5811987 0.3061354 1.898502 0.05762998
## 3 log(Frequency) 2.2936612 1.0037003 2.285205 0.02230079
For a unit increase in the log frequency of passing the driving test, the log odds of having attended a driving school increase by about 2.29 (roughly 200%).
# Coefficients on the log-odds scale.
coef(model5)
## (Intercept) No.Trial log(Frequency)
## -8.9546485 0.5811987 2.2936612
# Odds ratios with 95% profile-likelihood confidence intervals.
## odds ratios only)
## odds ratios and 95% CI
exp(cbind(OR = coef(model5), confint(model5)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.0001291355 4.029330e-09 0.07042341
## No.Trial 1.7881805650 1.107163e+00 3.86579749
## log(Frequency) 9.9111581761 2.154933e+00 139.86343263
# Predict using the test data
#testSet
# Link-scale (log-odds) predictions on the held-out set; response-scale
# probabilities are computed further below with type = 'response'.
pred<-predict(model5,testSet)
#
# # Print, plot variable importance
# Variable importance from caret (|t| statistics for a glm).
print(varImp(model5, scale = FALSE))
## Overall
## No.Trial 1.898502
## log(Frequency) 2.285205
#
# plot(varImp(model5, scale = FALSE), main="Variable Importance using logistic/glm")
#
#
# confusionMatrix(testSet$Hypertension,pred)
# Predicted (link-scale) values vs observed group. The original used
# data.frame(cbind(...)), which routes the factor through a matrix and
# silently coerces it to its integer codes; the coercion is now explicit,
# producing the identical data frame without the accidental conversion.
my_data=data.frame(predicted=pred,observed=as.numeric(testSet$Driving.school))
ggplot(my_data,aes(predicted,observed))+geom_point()+geom_smooth(method=lm)+ggtitle('logistic model')
# Class predictions at a 0.5 probability cutoff, tabulated against truth.
predict <- predict(model5,testSet, type = 'response')
pp=if_else(predict >0.5,"Yes","No")
tab=table(testSet$Driving.school, pp)
tab
## pp
## No Yes
## No 2 1
## Yes 1 2
# Recompute response-scale probabilities and threshold at 0.5.
# Fix: the original assigned the result to a variable named `predict`,
# shadowing stats::predict(); renamed to avoid that footgun.
pred_prob <- predict(model5,testSet, type = 'response')
pred_class <- if_else(pred_prob >0.5,"Yes","No")
#predict
# Side-by-side view of observed vs predicted classes on the test set.
data.frame(observed=testSet$Driving.school,predicted=pred_class)
## observed predicted
## 1 No No
## 2 No No
## 3 No Yes
## 4 Yes Yes
## 5 Yes No
## 6 Yes Yes
#cbind(m1$Driving.school, predict)
#==================================================================
#ROCR Curve
#==================================================================
# ROC curve via plotROC: d = outcome, m = marker. Driving.school is a
# Yes/No factor, so plotROC warns and recodes it as 0/1 (see the warnings
# in the output below).
p <- ggplot(data.frame(trainSet), aes(d = as.numeric(Driving.school), m = Frequency)) + geom_roc()+ style_roc()
plot_interactive_roc(p)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!
The probability Pn of a no-driving-school driver passing the test in a trial is 0.1706485, and the probability Ps of a driving-school driver passing the test in a trial is 0.8293515. This is nearly 5 times the rate for those not attending driving school.
# Empirical share of each driving-school group: group totals of Frequency
# divided by the overall total.
tapply(m1$Frequency,m1$Driving.school, sum)/sum(m1$Frequency)
## No Yes
## 0.1706485 0.8293515